#!/usr/bin/perl
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# SLZY_HDR_END

use strict;
use warnings;

use POSIX;
use Pod::Usage;
use Getopt::Long;
use Data::Dumper;
use File::Spec;
#use JSON;

# SLZY_POD_HDR_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING POD DOCUMENT:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN

=head1 NAME

B<caqltrack.pl> - track caql usage in csv files

=head1 VERSION

 This document describes version 5 of caqltrack.pl, released
 Wed Nov  7 19:51:35 2012.

=head1 SYNOPSIS

B<caqltrack.pl> 

Options:

    -help       brief help message
    -man        full documentation
    -connect    psql connect parameters
    -csv        process csv (log) files and extract/pre-aggregate caql usage statistics
    -missing    process aggregated output and list missing entries
    -aggregate  process pre-agg output from caqltrack to produce final statistics

=head1 OPTIONS

=over 8

=item B<-help>

    Print a brief help message and exits.

=item B<-man>

    Prints the manual page and exits.

=item B<-connect>

     psql connect string, 

 e.g: -connect '-p 11000 template1'

 If this option is supplied, connect to the database to determine the
 location of the data directories, and then find the CSV log files and
 perform phase 1.

=item B<-csv>

     In the first phase, consume the contents of the CSV log files and
 extract the caql references to construct pre-aggregated output with
 reference count statistics.

=item B<-missing>

    
 In the third phase, consume the aggregated output of the second
 phase and list the "missing" functions.

=item B<-aggregate>

     In the second phase, consume the pre-aggregated output of the first
 phase and produce a final set of totals.


=back

=head1 DESCRIPTION

caqltrack processes CSV log files and extracts references to caql
basic functions.  The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files.  Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.

=head2 USAGE

find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg 

=head2 OUTPUT

The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:

caql_basic_fn_94|aclchk.c|3141|33|101|6

The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.

The second phase consumes this output, and produces a final set of
fully-aggregated output.  Also, this phase sums all references to a
basic function into a #TOTAL# row:

 caql_basic_fn_85|ruleutils.c|7137|33|101306
 caql_basic_fn_85|tablecmds.c|1464333|101|6
 caql_basic_fn_85|#TOTAL#|0|0|0|312

Finally, the second phase output is terminated with grand total and
total unique query counts:

 #GRANDTOTAL#|#TOTAL#|0|0|0|1676343
 #NUMQUERY#|#TOTAL#|0|0|0|105



=head1 CAVEATS/Future Work



=head1 AUTHORS

Apache HAWQ

Address bug reports and comments to: dev@hawq.apache.org

=cut
# SLZY_POD_HDR_END

# SLZY_GLOB_BEGIN
# Globals shared by the BEGIN (option-parsing) block and the worker subs.
my $glob_id = ""; # message prefix handed to pod2usage
#my $glob_tabstr = "\t";
#my $glob_tabstr = " " x $glob_tabwidth; 
my $glob_glob2 = {tabwidth => 4, spacedtab => 1, tabstr => " " x 4};
my $glob_glob; # parsed command-line options (populated in BEGIN below)
# SLZY_GLOB_END

# Verify that the command line requested at least one processing mode;
# otherwise complain and exit via the usage message.
sub glob_validate
{
	my $have_mode = 0;

	# Any one of the four mode options is sufficient.
	for my $mode (qw(csv connect missing aggregate))
	{
		if (exists($glob_glob->{$mode}) && $glob_glob->{$mode})
		{
			$have_mode = 1;
			last;
		}
	}

	unless ($have_mode)
	{
		warn("ERROR: Must specify either CSV (process csv) or Aggregate options");
		pod2usage(-msg => $glob_id, -exitstatus => 1) ;
	}
}

# SLZY_CMDLINE_BEGIN
# WARNING: DO NOT MODIFY THE FOLLOWING SECTION:
# Generated by sleazy.pl version 6 (release Mon Aug 20 12:30:03 2012)
# Make any changes under SLZY_TOP_BEGIN/SLZY_LONG_BEGIN
# Any additional validation logic belongs in glob_validate()

BEGIN {
	    # Option targets for GetOptions; trailing comments mirror the
	    # "short" descriptions in the sleazy.pl definition at file bottom.
	    my $s_help      = 0;     # brief help message
	    my $s_man       = 0;     # full documentation
	    my $s_connect;           # psql connect parameters
	    my $s_csv       = 0;     # process csv (log) files and extract/pre-aggregate caql usage statistics
	    my $s_missing   = 0;     # process aggregated output and list missing entries
	    my $s_aggregate = 0;     # process pre-agg output from caqltrack to produce final statistics

	# Preserve the original (quoted) command line for the audit record
	# stored under _sleazy_properties.
	my $slzy_argv_str;
	$slzy_argv_str = quotemeta(join(" ", @ARGV))
		if (scalar(@ARGV));

    GetOptions(
		'help|?'     =>     \$s_help,
		'man'        =>     \$s_man,
		'connect:s'  =>     \$s_connect,
		'csv|log'    =>     \$s_csv,
		'missing'    =>     \$s_missing,
		'aggregate'  =>     \$s_aggregate,
               )
        or pod2usage(2);

	# -help prints the synopsis; -man prints the full POD manual page.
	pod2usage(-msg => $glob_id, -exitstatus => 1) if $s_help;
	pod2usage(-msg => $glob_id, -exitstatus => 0, -verbose => 2) if $s_man;
	
	
	$glob_glob = {};
	
	
	# version and properties from json definition
	$glob_glob->{_sleazy_properties} = {};
	$glob_glob->{_sleazy_properties}->{version} = '5';
	$glob_glob->{_sleazy_properties}->{slzy_date} = '1352346695';
	$glob_glob->{_sleazy_properties}->{slzy_argv_str} = $slzy_argv_str;
	
	# Copy parsed options into the global option hash; glob_validate()
	# then checks that at least one processing mode was requested.
	$glob_glob->{connect}    =  $s_connect      if (defined($s_connect));
	$glob_glob->{csv}        =  $s_csv          if (defined($s_csv));
	$glob_glob->{missing}    =  $s_missing      if (defined($s_missing));
	$glob_glob->{aggregate}  =  $s_aggregate    if (defined($s_aggregate));
	
	glob_validate();


}
# SLZY_CMDLINE_END

# convert a postgresql psql formatted table into an array of hashes
# convert a postgresql psql formatted table into an array of hashes,
# one hashref per data row, keyed by the column headings.
sub tablelizer
{
    my ($psql_out, $hdr_line) = @_;

    # Break the psql output into individual lines.
    my @body = split(/\n/, $psql_out);

    return undef
        unless (scalar(@body));

    # If the caller supplied the header line separately, @body holds
    # only data rows; otherwise the first line carries the column
    # names and the second is the ---+--- separator.
    my $header = $hdr_line;
    $header = shift @body
        unless (defined($hdr_line));

    # column names are delimited by <space>|<space>
    my @names = split(/\s+\|\s+/, $header);

    # the first and last headings may still carry edge whitespace
    for my $idx (0, -1)
    {
        $names[$idx] =~ s/^\s+//;
        $names[$idx] =~ s/\s+$//;
    }

    return undef
        unless (scalar(@body));

    # discard the dashed separator row (unless header came separately)
    shift @body
        unless (defined($hdr_line));

    my @result;

    for my $row_txt (@body)
    {
        my @fields = split(/\|/, $row_txt, scalar(@names));

        # stop at the first row lacking a full set of columns
        # (e.g. the trailing "(N rows)" footer)
        last
            unless (scalar(@fields) == scalar(@names));

        my $row = {};

        for my $pos (0..(scalar(@names)-1))
        {
            my $val = $fields[$pos];

            $val =~ s/^\s+//;
            $val =~ s/\s+$//;

            $row->{$names[$pos]} = $val;
        }
        push @result, $row;
    }

    return \@result;
}

sub do_conn
{
	# Query the database for the location of every segment data
	# directory, then run the "csv" phase (phase 1) over the CSV log
	# files found under each one.
	#
	# Requires $glob_glob->{connect} to hold valid psql options
	# (e.g. '-p 11000 template1').  Relies on File::Spec (loaded at the
	# top of this file) -- previously this sub used File::Spec without
	# the module ever being loaded, which died at runtime.
	#
	# NOTE(review): the hostname column is collected but the generated
	# "find | xargs" pipeline only runs locally -- confirm whether
	# remote segment hosts need handling.

	my $bigstr = <<'EOF_bigstr';
select gscp.dbid,
gscp.content,
gscp.hostname as hostname, gscp.address as address,
fep.fselocation as loc,
pfs.oid fsoid,
pfs.fsname,
gscp.mode,
gscp.status,
gscp.preferred_role
from
gp_segment_configuration gscp, pg_filespace_entry fep,
pg_filespace pfs
where
fsname = $q$pg_system$q$
and
fep.fsedbid=gscp.dbid
and pfs.oid = fep.fsefsoid
order by 1,2 
EOF_bigstr

	$bigstr .= ' ; ';

    my $psql_str = "psql ";

    $psql_str .= $glob_glob->{connect};

	$psql_str .= "  -c  \'  $bigstr  \'";

	# Run the query and parse the psql table output into hashrefs.
	my $tabdef = `$psql_str`;

    my $seg_config_table = tablelizer($tabdef);

#	print Data::Dumper->Dump([$seg_config_table]);

	# Collect the unique set of (hostname, parent directory) pairs; the
	# "loc" column is the pg_system filespace path, and its parent is
	# the directory searched for CSV log files.
	my %h1;

	for my $rowh (@{$seg_config_table})
    {
        my $dir		 = $rowh->{loc};
		my $hostname = $rowh->{hostname};

        # Strip the last path component to get the data directory root.
        my @foo = File::Spec->splitdir($dir);

        pop @foo;

        $dir = File::Spec->catdir(@foo);

		$h1{$hostname . ":" . $dir} = {hostname => $hostname,
									   dir => $dir};
	}

	# Phase 1 over each unique directory, in a stable (sorted) order.
	for my $hh (sort(keys(%h1)))
	{
		my $dir = $h1{$hh}->{dir};
		my $prog = $0;

		# Escape shell metacharacters so paths with spaces or quotes
		# cannot break (or inject into) the generated command line.
		my $qdir  = quotemeta($dir);
		my $qprog = quotemeta($prog);

		system("find $qdir -name '*.csv' | xargs perl $qprog -csv ");
	}


} # end do_conn

sub do_csv
{
	# Phase 1: read CSV log lines on <> (file names supplied by xargs,
	# or stdin), extract every "catquery: caql_basic_fn_NNN caller: ..."
	# message, and print pre-aggregated reference counts, one row per
	# unique key, sorted.
	my %bigh;

	while (<>)
	{
		my $ini = $_;

		# cheap pre-filter: only catquery/caql_basic lines are relevant
		next unless ($ini =~ m/catquery.*caql\_basic/);

		# make sure is actual message, not sql string for view:
		# 'catquery: caql_basic_fn_', ''), 'caller: ', ''), ' ') \
		# as caql_mess_arr
		next unless ($ini =~ m/caql\_basic\_fn\_\d+\s+caller/);

#		print $ini;
		# keep only the text after "catquery: " and before the first
		# comma (the CSV log line has further comma-separated columns)
		my @ggg = split(/catquery: /, $ini);
		my @fff = split(/\,/, $ggg[1]); 
		my $lin = $fff[0];
		die "bad line: $ini" unless (defined($lin) && length($lin));

#		print $lin;
		# NOTE: substitution order matters -- "caller:" must become a
		# separator before whitespace is globally collapsed to bars
		$lin =~ s/\"//g; # remove quotes;
		# change to vertical bar separated list
		$lin =~ s/\s*caller\:\s*/\|/; 
		$lin =~ s/\s+/\|/g; 

		# count references per unique (func, file, lineno, ...) key
		$bigh{$lin} = 0 unless (exists($bigh{$lin}));
		$bigh{$lin} += 1;
	} # end while

	# output in the form of 
	# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>, 
	# eg:
	#
	# caql_basic_fn_118|dbcommands.c|2270|33|101|3

	for my $kk (sort(keys(%bigh)))
	{
		print $kk, "|", $bigh{$kk}, "\n";
	}

} # end do_csv

sub do_agg
{
	# Phase 2: read the pre-aggregated phase-1 output on <> and produce
	# the final, fully-aggregated totals, including a per-function
	# #TOTAL# row plus #GRANDTOTAL# and #NUMQUERY# summary rows.
	my %bigh;

	# input in the form of 
	# <func name>|<filename>|<lineno>|<uniqqno>|<firstarg>|<reference count>, 
	# eg:
	#
	# caql_basic_fn_118|dbcommands.c|2270|33|101|3

	# NOTE: since caqltrack -csv is called via xargs, may have
	# multiple sets of pre-aggregated lines, 
	# so split by "key" portion (func, file, lineno, uniqqno, firstarg) and
	# sum the reference counts

	while (<>)
	{
		my $ini = $_;
		my @ggg = split(/\|/, $ini);
		die "bad line: $ini" unless (1 < scalar(@ggg));

		# extract the reference count from the end of the line
		# (numeric addition below strips the trailing newline)
		my $num = pop @ggg;
		die "bad line count: $ini" unless ($num =~ m/\d+/);
		
		# convert the "firstarg" to zero to fix aggregation
		$ggg[-1] = 0;

		# re-join the key columns and sum counts across the multiple
		# pre-aggregated rows that may exist for the same key
		my $lin = join("|", @ggg);
		$bigh{$lin} = 0 unless (exists($bigh{$lin}));
		$bigh{$lin} += $num; # NOTE: not 1, sum the reference counts

	}

	my $currqry;	# function name of the group currently being totaled
	my $numcurr	 = 0;	# running reference count for $currqry
	my $numqry	 = 0;	# number of distinct query functions seen
	my $grandtot = 0;	# reference count summed across all functions

	# Sorted keys put all rows for the same function adjacent, so emit
	# a #TOTAL# row each time the function name changes (and once more
	# after the loop for the final group).
	for my $kk (sort(keys(%bigh)))
	{
		my @ggg = split(/\|/, $kk);

		if (defined($currqry))
		{
			if ($currqry eq $ggg[0])
			{
				$numcurr += $bigh{$kk};
			}
			else
			{
				print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
				$currqry = $ggg[0];
				$numcurr = $bigh{$kk};
				$numqry += 1;
			}
		}
		else
		{
			# first row: start the first group
			$currqry = $ggg[0];
			$numcurr = $bigh{$kk};
			$numqry += 1;
		}
		print $kk, "|", $bigh{$kk}, "\n";
		$grandtot += $bigh{$kk};

	} # end for $kk
	# flush the #TOTAL# row for the last group, if any input was seen
	if (defined($currqry))
	{
		print $currqry, "|#TOTAL#|0|0|0|", $numcurr, "\n";
	}
	if ($grandtot)
	{
		print "#GRANDTOTAL#|#TOTAL#|0|0|0|", $grandtot, "\n";
		print "#NUMQUERY#|#TOTAL#|0|0|0|", $numqry, "\n";
	}

} # end do_agg

sub do_missing
{
	# Phase 3: read the fully-aggregated phase-2 output on <> and print
	# every caql basic function number that never appeared, followed by
	# the count of missing functions and the highest function number.

	# input in the form of 
	# <func name>|#TOTAL#|0|0|0|<reference count>, eg:
	#
	# caql_basic_fn_118|#TOTAL#|0|0|0|3

	my @seen;

	# guarantee the array has at least one slot
	$seen[0] = undef;

	while (my $row = <>)
	{
		# only the per-function #TOTAL# summary rows are interesting
		next unless ($row =~ m/^caql\_basic\_fn/);
		next unless ($row =~ m/\#TOTAL\#/);

		my @matched = ($row =~ m/^caql\_basic\_fn\_(\d+)/);
		die "bad line: $row" unless (1 == scalar(@matched));

		# extract the function number; numbers are 1-based, the array
		# is 0-based
		my $fnno = pop @matched;
		$fnno--;
		die "bad fn: $row" unless ($fnno >= 0);

		$seen[$fnno] = $row;
	}

	my $missing_cnt = 0;

	for my $slot (0..(scalar(@seen)-1))
	{
		next if (defined($seen[$slot]));

		$missing_cnt += 1;
		print "caql_basic_fn_" . ($slot+1) . "\n";
	}
	print "#MISSING: ", $missing_cnt, "\n";
	print "#MAXNUM: ", scalar(@seen), "\n";

} # end do_missing

# Main entry: dispatch to exactly one processing phase, chosen from the
# command-line options stored in $glob_glob by the BEGIN block above.
# Precedence (first match wins) is: connect, csv, aggregate, missing.
if (1)
{
	my %dispatch = (
		connect   => \&do_conn,
		csv       => \&do_csv,
		aggregate => \&do_agg,
		missing   => \&do_missing,
		);

	my $mode;

	for my $candidate (qw(connect csv aggregate missing))
	{
		next unless (exists($glob_glob->{$candidate}) &&
					 $glob_glob->{$candidate});
		$mode = $candidate;
		last;
	}

	die "invalid options!"
		unless (defined($mode));

	$dispatch{$mode}->();
}

# SLZY_TOP_BEGIN
# Never executed: the heredoc below is the sleazy.pl JSON definition
# used to (re)generate the POD header and command-line parsing sections
# of this file.  Edit option metadata here, then rerun the generator.
if (0)
{
    my $bigstr = <<'EOF_bigstr';
{
   "args" : [
      {
         "alias" : "?",
         "long" : "Print a brief help message and exits.",
         "name" : "help",
         "required" : "0",
         "short" : "brief help message",
         "type" : "untyped"
      },
      {
         "long" : "Prints the manual page and exits.",
         "name" : "man",
         "required" : "0",
         "short" : "full documentation",
         "type" : "untyped"
      },
      {
         "long" : "$connlong",
         "name" : "connect",
         "short" : "psql connect parameters",
         "type" : "string"
      },
      {
         "alias" : "log",
         "long" : "$csvlong",
         "name" : "csv",
         "short" : "process csv (log) files and extract/pre-aggregate caql usage statistics",
         "type" : "u"
      },
      {
         "long" : "$missinglong",
         "name" : "missing",
         "short" : "process aggregated output and list missing entries",
         "type" : "u"
      },
      {
         "long" : "$agglong",
         "name" : "aggregate",
         "short" : "process pre-agg output from caqltrack to produce final statistics",
         "type" : "u"
      }
   ],
   "long" : "$toplong",
   "properties" : {
      "slzy_date" : 1352346695
   },
   "short" : "track caql usage in csv files",
   "version" : "5"
}

EOF_bigstr
}
# SLZY_TOP_END

# SLZY_LONG_BEGIN
# Never executed: these heredocs hold the long-form documentation text
# that sleazy.pl interpolates (via the $...long placeholders in the
# JSON above) when regenerating the POD section of this file.
if (0)
{

	my $toplong = <<'EOF_longstr';
caqltrack processes CSV log files and extracts references to caql
basic functions.  The "csv" phase is designed to iterate over sets of
CSV files (as supplied by xargs) and produce pre-aggregated output.
Since xargs may invoke caqltrack.pl multiple times, we cannot
guarantee that a single invocation of caqltrack will process all the
csv files.  Thus, the first phase uses caqltrack to pre-aggregate the
caql basic function references, and the second phase invokes caqltrack
to consume the pre-aggregated output and produce the final totals.

{HEAD2} USAGE

find [data-directory] -name '*.csv' | xargs perl caqltrack.pl -csv | perl caqltrack.pl -agg 

{HEAD2} OUTPUT

The output of the first phase is sorted by basic query function,
filename, lineno, unique query code, and the "first arg" (if it is an
oid) followed by a reference count, with vertical bar separators, eg:

caql_basic_fn_94|aclchk.c|3141|33|101|6

The output of the first phase is "pre-aggregated", so it may have
multiple rows for a single (query function, filename, lineno) key.

The second phase consumes this output, and produces a final of
fully-aggregated output.  Also, this phase sums all references to a
basic function into a #TOTAL# row:

 caql_basic_fn_85|ruleutils.c|7137|33|101306
 caql_basic_fn_85|tablecmds.c|1464333|101|6
 caql_basic_fn_85|#TOTAL#|0|0|0312

Finally, the second phase output is terminated with grand total and
total unique query counts:

 #GRANDTOTAL#|#TOTAL#|0|0|0|1676343
 #NUMQUERY#|#TOTAL#|0|0|0|105



{HEAD1} CAVEATS/Future Work


EOF_longstr

my $missinglong = <<'EOF_missinglong';

 In the third phase, consume the aggregated output of the second
 phase and list the "missing" functions.
EOF_missinglong

my $agglong = <<'EOF_agglong';
 In the second phase, consume the pre-aggregated output of the first
 phase and produce a final set of totals.
EOF_agglong

my $csvlong = <<'EOF_csvlong';
 In the first phase, consume the contents of the CSV log files and
 extract the caql references to construct pre-aggregated output with
 reference count statistics.
EOF_csvlong

my $connlong = <<'EOF_connlong';
 psql connect string, 

 e.g: -connect '-p 11000 template1'

 If this option is supplied, connect to the database to determine the
 location of the data directories, and then find the CSV log files and
 perform phase 1.
EOF_connlong

}
# SLZY_LONG_END

